This file contains an example of tuning a Logistic Regression model with BayesSearchCV
import pickle
import time
import helpsk as hlp
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
import plotly.io as pio
pio.renderers.default='notebook'
# Load the training features and labels serialized by an earlier notebook.
# NOTE(review): pickle.load is only safe on trusted input -- these are local
# artifacts produced by this project, so that is acceptable here.
with open('../X_train.pkl', 'rb') as pkl_file:
    X_train = pickle.load(pkl_file)
with open('../y_train.pkl', 'rb') as pkl_file:
    y_train = pickle.load(pkl_file)
# Summary statistics (nulls, zeros, distribution shape) for the numeric columns.
hlp.pandas.numeric_summary(X_train)
| # of Non-Nulls | # of Nulls | % Nulls | # of Zeros | % Zeros | Mean | St Dev. | Coef of Var | Skewness | Kurtosis | Min | 10% | 25% | 50% | 75% | 90% | Max | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| duration | 760 | 40 | 5.0% | 0 | 0.0% | 21.0 | 11.7 | 0.6 | 1.0 | 0.6 | 4.0 | 9.0 | 12.0 | 18.0 | 24.0 | 36.0 | 60.0 |
| credit_amount | 800 | 0 | 0.0% | 38 | 5.0% | 3,203.9 | 2,932.3 | 0.9 | 1.9 | 3.9 | 0.0 | 753.9 | 1,300.8 | 2,236.5 | 3,951.5 | 7,394.6 | 18,424.0 |
| installment_commitment | 800 | 0 | 0.0% | 0 | 0.0% | 3.0 | 1.1 | 0.4 | -0.5 | -1.2 | 1.0 | 1.0 | 2.0 | 3.0 | 4.0 | 4.0 | 4.0 |
| residence_since | 800 | 0 | 0.0% | 0 | 0.0% | 2.9 | 1.1 | 0.4 | -0.3 | -1.4 | 1.0 | 1.0 | 2.0 | 3.0 | 4.0 | 4.0 | 4.0 |
| age | 800 | 0 | 0.0% | 0 | 0.0% | 35.6 | 11.4 | 0.3 | 1.0 | 0.7 | 19.0 | 23.0 | 27.0 | 33.0 | 42.0 | 52.0 | 75.0 |
| existing_credits | 800 | 0 | 0.0% | 0 | 0.0% | 1.4 | 0.6 | 0.4 | 1.3 | 1.6 | 1.0 | 1.0 | 1.0 | 1.0 | 2.0 | 2.0 | 4.0 |
| num_dependents | 800 | 0 | 0.0% | 0 | 0.0% | 1.1 | 0.3 | 0.3 | 2.0 | 2.1 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 2.0 | 2.0 |
# Summary (nulls, most frequent value, cardinality) for the categorical columns.
hlp.pandas.non_numeric_summary(X_train)
| # of Non-Nulls | # of Nulls | % Nulls | Most Freq. Value | # of Unique | % Unique | |
|---|---|---|---|---|---|---|
| checking_status | 763 | 37 | 4.6% | no checking | 4 | 0.5% |
| credit_history | 800 | 0 | 0.0% | existing paid | 5 | 0.6% |
| purpose | 800 | 0 | 0.0% | radio/tv | 10 | 1.2% |
| savings_status | 800 | 0 | 0.0% | <100 | 5 | 0.6% |
| employment | 800 | 0 | 0.0% | 1<=X<4 | 5 | 0.6% |
| personal_status | 800 | 0 | 0.0% | male single | 4 | 0.5% |
| other_parties | 800 | 0 | 0.0% | none | 3 | 0.4% |
| property_magnitude | 800 | 0 | 0.0% | car | 4 | 0.5% |
| other_payment_plans | 800 | 0 | 0.0% | none | 3 | 0.4% |
| housing | 800 | 0 | 0.0% | own | 3 | 0.4% |
| job | 800 | 0 | 0.0% | skilled | 4 | 0.5% |
| own_telephone | 800 | 0 | 0.0% | none | 2 | 0.2% |
| foreign_worker | 800 | 0 | 0.0% | yes | 2 | 0.2% |
# Peek at the first ten labels (0/1 -- presumably 1 = default; TODO confirm).
y_train[0:10]
array([1, 1, 0, 1, 0, 1, 0, 1, 1, 0])
# Raw counts per class: 559 of class 0 vs 241 of class 1.
np.unique(y_train, return_counts=True)
(array([0, 1]), array([559, 241]))
# Class proportions (~70% / ~30%, so the dataset is moderately imbalanced).
# Compute the counts once instead of calling np.unique(..., return_counts=True)
# twice on the same array.
_, _class_counts = np.unique(y_train, return_counts=True)
_class_counts / _class_counts.sum()
array([0.69875, 0.30125])
# Define the hyper-parameter search space (model + preprocessing choices) for
# logistic regression via helpsk; 50 Bayesian-optimization iterations.
search_space = hlp.sklearn_search.ClassifierSearchSpace(
    data=X_train,
    models=[hlp.sklearn_search.ClassifierSearchSpaceModels.LogisticRegression],
    iterations=[50],
    random_state=42,
)
# pip install scikit-optimize
from skopt import BayesSearchCV
#from skopt.space import Real, Categorical, Integer
from sklearn.model_selection import RepeatedKFold
# Bayesian hyper-parameter search over the helpsk-defined pipeline/space:
# 5-fold CV repeated twice (10 fits per candidate), scored by ROC AUC,
# parallelized across all cores.
bayes_search = BayesSearchCV(
    estimator=search_space.pipeline(),
    search_spaces=search_space.search_spaces(),
    cv=RepeatedKFold(n_splits=5, n_repeats=2),
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
# Run the search and report wall-clock time. perf_counter() is a monotonic
# clock, so the measurement is not affected by system clock adjustments the
# way time.time() can be.
start_time = time.perf_counter()
bayes_search.fit(X_train, y_train)
elapsed_time = time.perf_counter() - start_time
print(f"Elapsed time to run BayesSearchCV: {elapsed_time:.3f} seconds; {elapsed_time / 60:.1f} minutes")
Elapsed time to run BayesSearchCV: 56.335 seconds; 0.9 minutes
# Best mean cross-validated ROC AUC found by the search.
print(bayes_search.best_score_)
0.7736194546305393
# Winning hyper-parameter combination (fitted pipeline-step objects included).
print(bayes_search.best_params_)
OrderedDict([('model', LogisticRegression(C=0.1749996766322668, max_iter=1000, random_state=42)), ('model__C', 0.1749996766322668), ('prep__non_numeric__encoder__transformer', OneHotEncoder(handle_unknown='ignore')), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())])
# Wrap the raw skopt results in helpsk's experiment-results object, persist it
# to YAML, then reload -- demonstrating the YAML round-trip preserves results.
results = hlp.sklearn_eval.MLExperimentResults.from_sklearn_search_cv(
    searcher=bayes_search,
    higher_score_is_better = True,
    parameter_name_mappings = search_space.param_name_mappings()
)
results.to_yaml_file(yaml_file_name = 'Run 1 - Logistic Regression - BayesSearchCV.yaml')
results = hlp.sklearn_eval.MLExperimentResults.from_yaml_file(yaml_file_name = 'Run 1 - Logistic Regression - BayesSearchCV.yaml')
# Matches bayes_search.best_score_ above.
results.best_score
0.7736194546305393
# Best parameters, shown with the friendly names from param_name_mappings.
results.best_params
{'model': 'LogisticRegression()',
 'C': 0.1749996766322668,
 'imputer': 'SimpleImputer()',
 'scaler': 'StandardScaler()',
 'encoder': 'OneHotEncoder()'}
# All 51 trials ranked by mean CV roc_auc, with 95% confidence intervals.
results.to_formatted_dataframe(num_rows=100, include_rank=True)
| rank | roc_auc Mean | roc_auc 95CI.LO | roc_auc 95CI.HI | C | imputer | scaler | encoder |
|---|---|---|---|---|---|---|---|
| 1 | 0.774 | 0.745 | 0.802 | 0.175 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 2 | 0.772 | 0.746 | 0.797 | 0.175 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 3 | 0.771 | 0.744 | 0.799 | 0.046 | SimpleImputer(strategy='most_frequent') | StandardScaler() | OneHotEncoder() |
| 4 | 0.771 | 0.755 | 0.786 | 0.238 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 5 | 0.771 | 0.750 | 0.791 | 0.022 | SimpleImputer(strategy='most_frequent') | StandardScaler() | OneHotEncoder() |
| 6 | 0.770 | 0.740 | 0.801 | 0.078 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 7 | 0.770 | 0.750 | 0.791 | 0.284 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 8 | 0.767 | 0.745 | 0.789 | 0.109 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 9 | 0.767 | 0.736 | 0.797 | 1.548 | SimpleImputer(strategy='most_frequent') | MinMaxScaler() | OneHotEncoder() |
| 10 | 0.767 | 0.739 | 0.794 | 0.240 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 11 | 0.766 | 0.752 | 0.780 | 1.596 | SimpleImputer(strategy='most_frequent') | StandardScaler() | OneHotEncoder() |
| 12 | 0.766 | 0.745 | 0.787 | 1.137 | SimpleImputer() | MinMaxScaler() | OneHotEncoder() |
| 13 | 0.766 | 0.740 | 0.791 | 0.141 | SimpleImputer() | MinMaxScaler() | OneHotEncoder() |
| 14 | 0.766 | 0.743 | 0.788 | 0.213 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 15 | 0.765 | 0.753 | 0.777 | 0.099 | SimpleImputer() | MinMaxScaler() | OneHotEncoder() |
| 16 | 0.765 | 0.733 | 0.797 | 1.593 | SimpleImputer() | MinMaxScaler() | OneHotEncoder() |
| 17 | 0.764 | 0.738 | 0.789 | 0.234 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 18 | 0.764 | 0.737 | 0.790 | 0.035 | SimpleImputer(strategy='most_frequent') | MinMaxScaler() | OneHotEncoder() |
| 19 | 0.763 | 0.748 | 0.779 | 22.376 | SimpleImputer(strategy='median') | MinMaxScaler() | OneHotEncoder() |
| 20 | 0.762 | 0.739 | 0.785 | 3.170 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 21 | 0.762 | 0.726 | 0.798 | <NA> | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 22 | 0.762 | 0.735 | 0.789 | 0.769 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 23 | 0.762 | 0.732 | 0.792 | 0.454 | SimpleImputer(strategy='most_frequent') | StandardScaler() | OneHotEncoder() |
| 24 | 0.760 | 0.737 | 0.782 | 99.780 | SimpleImputer() | MinMaxScaler() | OneHotEncoder() |
| 25 | 0.759 | 0.725 | 0.794 | 99.886 | SimpleImputer(strategy='median') | StandardScaler() | OneHotEncoder() |
| 26 | 0.759 | 0.741 | 0.777 | 0.307 | SimpleImputer(strategy='most_frequent') | MinMaxScaler() | OneHotEncoder() |
| 27 | 0.758 | 0.730 | 0.787 | 0.158 | SimpleImputer(strategy='most_frequent') | StandardScaler() | OneHotEncoder() |
| 28 | 0.758 | 0.724 | 0.792 | 0.005 | SimpleImputer(strategy='most_frequent') | MinMaxScaler() | OneHotEncoder() |
| 29 | 0.758 | 0.735 | 0.782 | 0.000 | SimpleImputer(strategy='most_frequent') | MinMaxScaler() | OneHotEncoder() |
| 30 | 0.757 | 0.742 | 0.773 | 99.531 | SimpleImputer(strategy='most_frequent') | MinMaxScaler() | OneHotEncoder() |
| 31 | 0.756 | 0.740 | 0.773 | 0.010 | SimpleImputer() | MinMaxScaler() | OneHotEncoder() |
| 32 | 0.756 | 0.729 | 0.782 | 0.000 | SimpleImputer(strategy='median') | MinMaxScaler() | OneHotEncoder() |
| 33 | 0.754 | 0.718 | 0.790 | 0.000 | SimpleImputer() | MinMaxScaler() | OneHotEncoder() |
| 34 | 0.753 | 0.727 | 0.779 | 0.001 | SimpleImputer(strategy='median') | MinMaxScaler() | OneHotEncoder() |
| 35 | 0.753 | 0.724 | 0.782 | 99.886 | SimpleImputer(strategy='most_frequent') | StandardScaler() | OneHotEncoder() |
| 36 | 0.752 | 0.717 | 0.787 | 32.731 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 37 | 0.752 | 0.725 | 0.779 | 0.005 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 38 | 0.752 | 0.725 | 0.779 | 22.913 | SimpleImputer(strategy='most_frequent') | MinMaxScaler() | OneHotEncoder() |
| 39 | 0.748 | 0.730 | 0.766 | 0.000 | SimpleImputer() | MinMaxScaler() | OneHotEncoder() |
| 40 | 0.732 | 0.714 | 0.751 | 0.000 | SimpleImputer(strategy='most_frequent') | StandardScaler() | OneHotEncoder() |
| 41 | 0.732 | 0.708 | 0.756 | 0.403 | SimpleImputer(strategy='median') | MinMaxScaler() | CustomOrdinalEncoder() |
| 42 | 0.732 | 0.703 | 0.760 | 0.003 | SimpleImputer(strategy='median') | StandardScaler() | CustomOrdinalEncoder() |
| 43 | 0.731 | 0.708 | 0.754 | 99.935 | SimpleImputer(strategy='median') | MinMaxScaler() | CustomOrdinalEncoder() |
| 44 | 0.731 | 0.710 | 0.752 | 3.489 | SimpleImputer(strategy='median') | StandardScaler() | CustomOrdinalEncoder() |
| 45 | 0.730 | 0.711 | 0.748 | 99.180 | SimpleImputer(strategy='median') | StandardScaler() | CustomOrdinalEncoder() |
| 46 | 0.728 | 0.695 | 0.762 | 0.108 | SimpleImputer(strategy='most_frequent') | StandardScaler() | CustomOrdinalEncoder() |
| 47 | 0.728 | 0.698 | 0.759 | 0.000 | SimpleImputer(strategy='most_frequent') | StandardScaler() | OneHotEncoder() |
| 48 | 0.722 | 0.703 | 0.740 | 0.000 | SimpleImputer() | StandardScaler() | CustomOrdinalEncoder() |
| 49 | 0.720 | 0.696 | 0.743 | 0.000 | SimpleImputer(strategy='most_frequent') | StandardScaler() | CustomOrdinalEncoder() |
| 50 | 0.704 | 0.677 | 0.732 | 0.000 | SimpleImputer(strategy='most_frequent') | MinMaxScaler() | CustomOrdinalEncoder() |
| 51 | 0.700 | 0.658 | 0.742 | 0.000 | SimpleImputer(strategy='median') | MinMaxScaler() | CustomOrdinalEncoder() |
# trial_rankings gives the score rank (1 = best) for each trial index.
# e.g. array([4, 2, 1, 3]) means the 1st trial (i.e. set of params) was the
# worst (rank 4) and the 3rd trial was the best (rank 1).
results.trial_rankings
array([21, 19, 49, 42, 36, 11, 38, 41, 44, 34, 51, 47, 6, 13, 3, 37, 26,
43, 7, 23, 18, 25, 22, 24, 32, 33, 50, 1, 45, 40, 8, 31, 16, 48,
27, 39, 12, 10, 30, 35, 20, 14, 15, 9, 46, 4, 17, 29, 28, 5, 2])
# best_trial_indexes gives the trial indexes ordered from best to worst score.
# e.g. for rankings of array([4, 2, 1, 3]) it would return array([2, 1, 3, 0]):
# trial index 2 (i.e. the 3rd trial) was the best, so it comes first, and
# trial index 0 (the first trial) was the worst, so it comes last.
results.best_trial_indexes
array([27, 50, 14, 45, 49, 12, 18, 30, 43, 37, 5, 36, 13, 41, 42, 32, 46,
20, 1, 40, 0, 22, 19, 23, 21, 16, 34, 48, 47, 38, 31, 24, 25, 9,
39, 4, 15, 6, 35, 29, 7, 3, 17, 8, 28, 44, 11, 33, 2, 26, 10])
# Mean CV score per trial, in the order the trials ran.
results.plot_performance_across_trials().show()
results.plot_performance_across_trials(size=None, color='C').show()
# NOTE(review): the next call raises a ValueError (traceback below) because one
# trial has a missing value for 'C' (see the <NA> row in the table above) and
# plotly's marker 'size' property rejects NaN.
results.plot_performance_across_trials(size='C', color='scaler').show()
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) /var/folders/7x/wc3jx_91337bggbzk01kpvs40000gn/T/ipykernel_44460/3729232008.py in <module> ----> 1 results.plot_performance_across_trials(size='C').show() ~/repos/python-helpers/helpsk/sklearn_eval.py in plot_performance_across_trials(self, size, color, color_continuous_scale, facet_by, facet_num_col, query, height, width) 1018 ascending=True) 1019 -> 1020 fig = px.scatter( 1021 data_frame=labeled_df, 1022 x='Trial Index', ~/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/plotly/express/_chart_types.py in scatter(data_frame, x, y, color, symbol, size, hover_name, hover_data, custom_data, text, facet_row, facet_col, facet_col_wrap, facet_row_spacing, facet_col_spacing, error_x, error_x_minus, error_y, error_y_minus, animation_frame, animation_group, category_orders, labels, orientation, color_discrete_sequence, color_discrete_map, color_continuous_scale, range_color, color_continuous_midpoint, symbol_sequence, symbol_map, opacity, size_max, marginal_x, marginal_y, trendline, trendline_options, trendline_color_override, trendline_scope, log_x, log_y, range_x, range_y, render_mode, title, template, width, height) 64 mark in 2D space. 
65 """ ---> 66 return make_figure(args=locals(), constructor=go.Scatter) 67 68 ~/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/plotly/express/_core.py in make_figure(args, constructor, trace_patch, layout_patch) 2112 args, trace_spec, group, mapping_labels.copy(), sizeref 2113 ) -> 2114 trace.update(patch) 2115 if fit_results is not None: 2116 trendline_rows.append(mapping_labels.copy()) ~/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/plotly/basedatatypes.py in update(self, dict1, overwrite, **kwargs) 5082 BaseFigure._perform_update(self, kwargs, overwrite=overwrite) 5083 else: -> 5084 BaseFigure._perform_update(self, dict1, overwrite=overwrite) 5085 BaseFigure._perform_update(self, kwargs, overwrite=overwrite) 5086 ~/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/plotly/basedatatypes.py in _perform_update(plotly_obj, update_obj, overwrite) 3898 # Update compound objects recursively 3899 # plotly_obj[key].update(val) -> 3900 BaseFigure._perform_update(plotly_obj[key], val) 3901 elif isinstance(validator, CompoundArrayValidator): 3902 if plotly_obj[key]: ~/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/plotly/basedatatypes.py in _perform_update(plotly_obj, update_obj, overwrite) 3919 else: 3920 # Assign non-compound value -> 3921 plotly_obj[key] = val 3922 3923 elif isinstance(plotly_obj, tuple): ~/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/plotly/basedatatypes.py in __setitem__(self, prop, value) 4817 # ### Handle simple property ### 4818 else: -> 4819 self._set_prop(prop, value) 4820 else: 4821 # Make sure properties dict is initialized ~/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/plotly/basedatatypes.py in _set_prop(self, prop, val) 5161 return 5162 else: -> 5163 raise err 5164 5165 # val is None ~/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/plotly/basedatatypes.py in _set_prop(self, prop, val) 5156 5157 try: -> 5158 val = 
validator.validate_coerce(val) 5159 except ValueError as err: 5160 if self._skip_invalid: ~/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/_plotly_utils/basevalidators.py in validate_coerce(self, v) 783 ].tolist() 784 --> 785 self.raise_invalid_elements(some_invalid_els) 786 787 v = v_array # Always numeric numpy array ~/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/_plotly_utils/basevalidators.py in raise_invalid_elements(self, invalid_els) 303 def raise_invalid_elements(self, invalid_els): 304 if invalid_els: --> 305 raise ValueError( 306 """ 307 Invalid element(s) received for the '{name}' property of {pname} ValueError: Invalid element(s) received for the 'size' property of scatter.marker Invalid elements include: [nan] The 'size' property is a number and may be specified as: - An int or float in the interval [0, inf] - A tuple, list, or one-dimensional numpy array of the above
# Where in each parameter's range the optimizer spent its trials.
results.plot_parameter_values_across_trials().show()
results.plot_scatter_matrix(height=800, width=800 * hlp.plot.GOLDEN_RATIO).show()
results.plot_performance_numeric_params()
results.plot_parallel_coordinates().show()
results.plot_performance_non_numeric_params()
# Score vs C, colored by the scaler / encoder chosen in each trial.
results.plot_score_vs_parameter(
    parameter='C',
    color='scaler'
)
results.plot_score_vs_parameter(
    parameter='C',
    color='encoder'
)
score_variable = results.primary_score_name + ' Mean'
score_dataframe = results.to_dataframe()
# Keep only the mean-score column and the hyper-parameter columns.
score_dataframe = score_dataframe.drop(columns=[x for x in score_dataframe.columns
                                                if x not in [score_variable] + results.parameter_names])
score_dataframe.head()
| roc_auc Mean | C | imputer | scaler | encoder | |
|---|---|---|---|---|---|
| 27 | 0.773619 | 0.175000 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 50 | 0.771631 | 0.174652 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 14 | 0.771423 | 0.046366 | SimpleImputer(strategy='most_frequent') | StandardScaler() | OneHotEncoder() |
| 45 | 0.770928 | 0.237651 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 49 | 0.770668 | 0.022242 | SimpleImputer(strategy='most_frequent') | StandardScaler() | OneHotEncoder() |
def _sanitize_name(name):
    """Replace spaces with underscores and drop any remaining non-alphanumerics
    so the name is usable in a statsmodels formula."""
    return ''.join(ch for ch in name.replace(' ', '_') if ch == '_' or ch.isalnum())

# Mapping of original column name -> formula-safe column name.
cleaned_column_names = {col: _sanitize_name(col) for col in score_dataframe.columns}
# Show the original -> cleaned name mapping, then apply it.
cleaned_column_names
{'roc_auc Mean': 'roc_auc_Mean',
 'C': 'C',
 'imputer': 'imputer',
 'scaler': 'scaler',
 'encoder': 'encoder'}
score_dataframe = score_dataframe.rename(columns=cleaned_column_names)
import statsmodels.formula.api as smf
# Regress the mean CV score on the hyper-parameters to see which choices
# actually move the score.
y_column = 'roc_auc_Mean'
X_columns = score_dataframe.columns.tolist()
X_columns.remove(y_column)
# e.g. "C + imputer + scaler + encoder"
X_columns = hlp.string.collapse(X_columns, separate=" + ", surround="")
formula = f"{y_column} ~ {X_columns}"
print(formula)
model = smf.ols(formula=formula, data = score_dataframe)
# NOTE(review): this rebinds `results`, shadowing the MLExperimentResults
# object created earlier; later cells rely on it being the statsmodels fit.
results = model.fit()
print(results.summary())
roc_auc_Mean ~ C + imputer + scaler + encoder
OLS Regression Results
==============================================================================
Dep. Variable: roc_auc_Mean R-squared: 0.735
Model: OLS Adj. R-squared: 0.705
Method: Least Squares F-statistic: 24.37
Date: Sat, 12 Feb 2022 Prob (F-statistic): 1.14e-11
Time: 17:24:33 Log-Likelihood: 163.25
No. Observations: 50 AIC: -314.5
Df Residuals: 44 BIC: -303.0
Df Model: 5
Covariance Type: nonrobust
======================================================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------------------------------------------
Intercept 0.7232 0.005 143.176 0.000 0.713 0.733
imputer[T.SimpleImputer(strategy='median')] -0.0011 0.005 -0.235 0.815 -0.011 0.008
imputer[T.SimpleImputer(strategy='most_frequent')] -0.0060 0.003 -1.877 0.067 -0.012 0.000
scaler[T.StandardScaler()] 0.0035 0.003 1.210 0.233 -0.002 0.009
encoder[T.OneHotEncoder()] 0.0377 0.004 9.114 0.000 0.029 0.046
C 2.638e-06 4.54e-05 0.058 0.954 -8.88e-05 9.4e-05
==============================================================================
Omnibus: 17.523 Durbin-Watson: 1.125
Prob(Omnibus): 0.000 Jarque-Bera (JB): 21.259
Skew: -1.334 Prob(JB): 2.42e-05
Kurtosis: 4.757 Cond. No. 186.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
import pandas as pd
# Standardize the numeric columns so the regression coefficients are on a
# comparable scale; pass the categorical columns through unchanged.
# NOTE(review): this `scaler` is unused -- the pipeline below builds its own.
scaler = StandardScaler()
#scaler.fit_transform(bayes_search_df)
numeric_columns = hlp.pandas.get_numeric_columns(score_dataframe)
non_numeric_columns = hlp.pandas.get_non_numeric_columns(score_dataframe)
print(numeric_columns)
print(non_numeric_columns)
numeric_pipeline = Pipeline([
    ('scaling', StandardScaler()),
])
transformations_pipeline = ColumnTransformer([
    ('numeric_pipeline', numeric_pipeline, numeric_columns),
    ('non_numeric_pipeline', 'passthrough', non_numeric_columns)
])
# The transformer outputs columns in pipeline order (numeric first), so the
# DataFrame is rebuilt with numeric_columns + non_numeric_columns to match.
score_dataframe_transformed = transformations_pipeline.fit_transform(score_dataframe)
score_dataframe_transformed = pd.DataFrame(score_dataframe_transformed,
                                           columns= numeric_columns + non_numeric_columns)
score_dataframe_transformed.head()
['roc_auc_Mean', 'C'] ['imputer', 'scaler', 'encoder']
| roc_auc_Mean | C | imputer | scaler | encoder | |
|---|---|---|---|---|---|
| 0 | 1.151779 | -0.423727 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 1 | 1.040178 | -0.423738 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 2 | 1.028505 | -0.42771 | SimpleImputer(strategy='most_frequent') | StandardScaler() | OneHotEncoder() |
| 3 | 1.000682 | -0.421788 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 4 | 0.986078 | -0.428457 | SimpleImputer(strategy='most_frequent') | StandardScaler() | OneHotEncoder() |
# The mixed-type transformer output left these columns as object dtype;
# cast back to float before refitting the regression.
score_dataframe_transformed['roc_auc_Mean'] = score_dataframe_transformed['roc_auc_Mean'].astype('float')
score_dataframe_transformed['C'] = score_dataframe_transformed['C'].astype('float')
print(formula)
# Same regression on the standardized data: identical t-stats and p-values,
# but coefficients are now comparable in magnitude.
model = smf.ols(formula=formula,
                data = score_dataframe_transformed)
results = model.fit()
print(results.summary())
roc_auc_Mean ~ C + imputer + scaler + encoder
OLS Regression Results
==============================================================================
Dep. Variable: roc_auc_Mean R-squared: 0.735
Model: OLS Adj. R-squared: 0.705
Method: Least Squares F-statistic: 24.37
Date: Sat, 12 Feb 2022 Prob (F-statistic): 1.14e-11
Time: 17:24:35 Log-Likelihood: -38.138
No. Observations: 50 AIC: 88.28
Df Residuals: 44 BIC: 99.75
Df Model: 5
Covariance Type: nonrobust
======================================================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------------------------------------------
Intercept -1.6772 0.286 -5.871 0.000 -2.253 -1.101
imputer[T.SimpleImputer(strategy='median')] -0.0621 0.264 -0.235 0.815 -0.594 0.470
imputer[T.SimpleImputer(strategy='most_frequent')] -0.3372 0.180 -1.877 0.067 -0.699 0.025
scaler[T.StandardScaler()] 0.1961 0.162 1.210 0.233 -0.130 0.523
encoder[T.OneHotEncoder()] 2.1138 0.232 9.114 0.000 1.646 2.581
C 0.0048 0.082 0.058 0.954 -0.161 0.170
==============================================================================
Omnibus: 17.523 Durbin-Watson: 1.125
Prob(Omnibus): 0.000 Jarque-Bera (JB): 21.259
Skew: -1.334 Prob(JB): 2.42e-05
Kurtosis: 4.757 Cond. No. 7.94
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Collect the fitted coefficients and p-values into a tidy frame for plotting.
# Use .values for the Series-valued columns so the feature names don't also
# become the DataFrame index (previously the names appeared twice: as the
# index and as the 'feature' column).
coefficients = pd.DataFrame({
    'feature': results.params.index,
    'coefficient': results.params.values,
    'p_value': results.pvalues.values,
})
# The intercept is not a hyper-parameter; drop it. .copy() makes the slice an
# independent frame so the column assignment below can't hit a
# SettingWithCopyWarning.
coefficients = coefficients.query("feature != 'Intercept'").copy()
coefficients['Stat Sig'] = coefficients['p_value'] <= 0.05
coefficients
| feature | coefficient | p_value | Stat Sig | |
|---|---|---|---|---|
| imputer[T.SimpleImputer(strategy='median')] | imputer[T.SimpleImputer(strategy='median')] | -0.062118 | 8.150377e-01 | False |
| imputer[T.SimpleImputer(strategy='most_frequent')] | imputer[T.SimpleImputer(strategy='most_frequen... | -0.337201 | 6.720605e-02 | False |
| scaler[T.StandardScaler()] | scaler[T.StandardScaler()] | 0.196125 | 2.326576e-01 | False |
| encoder[T.OneHotEncoder()] | encoder[T.OneHotEncoder()] | 2.113756 | 1.079491e-11 | True |
| C | C | 0.004783 | 9.538768e-01 | False |
score_variable
'roc_auc Mean'
# Bar chart of coefficients sorted by absolute magnitude; color marks
# statistical significance at alpha = 0.05 (only the encoder choice is).
px.bar(
    data_frame=coefficients.reindex(coefficients['coefficient'].abs().sort_values(ascending=True).index),
    y='feature',
    x='coefficient',
    color='Stat Sig',
    title=f"Regression Coefficients of Hyper-parameters against '{score_variable}'",
    height=600,
    width=600*hlp.plot.GOLDEN_RATIO
)
from sklearn.inspection import permutation_importance
# Permutation importance on the full fitted pipeline (imputation/scaling/
# encoding included), so importances attribute to the original input columns.
# (Removed the unused `forest = bayes_search.best_estimator_['model']` line.)
start_time = time.perf_counter()
result = permutation_importance(
    bayes_search.best_estimator_, X_train, y_train, n_repeats=10, random_state=42, n_jobs=2
)
elapsed_time = time.perf_counter() - start_time
print(f"Elapsed time to compute the importances: {elapsed_time:.3f} seconds")
feature_names = X_train.columns.to_list()
# Despite the name (copied from a random-forest example), the model here is
# logistic regression; the variable name is kept because later cells use it.
forest_importances = pd.Series(result.importances_mean, index=feature_names)
forest_importances = forest_importances.sort_values(ascending=False)
Elapsed time to compute the importances: 3.138 seconds
import matplotlib.pyplot as plt
# Bar plot of mean permutation importance with std-dev error bars.
fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=result.importances_std, ax=ax)
ax.set_title("Feature importances using permutation on full model")
ax.set_ylabel("Mean accuracy decrease")
fig.set_size_inches(9, 6)
fig.tight_layout()
plt.show()
# Default rate by foreign_worker status. Use the string alias 'mean' rather
# than np.mean -- passing numpy callables to DataFrame.agg is deprecated in
# recent pandas and resolves to the same built-in aggregation.
temp = X_train.copy()
temp['default'] = y_train
temp.groupby('foreign_worker').agg({'default': 'mean'})
| default | |
|---|---|
| foreign_worker | |
| yes | 0.308290 |
| no | 0.107143 |
# Box plot of age by default status -- a visual sanity check on one of the
# features that ranked highly in the permutation importances.
# (Removed a block of commented-out kwargs copy-pasted from another plot;
# they referenced names not defined in this notebook.)
fig = px.box(
    data_frame=temp,
    y='age',
    x='default',
    height=600,
    width=600*hlp.plot.GOLDEN_RATIO
)
fig.show()
NOTE: foreign_worker seems like it should be important (the default rate above is ~30.8% for foreign workers vs ~10.7% for non-foreign workers) but it is ranked last in permutation importance — plausibly because only a small minority of rows are non-foreign workers (28 of 800), so permuting the column rarely changes predictions; verify.